In [1]:
from __future__ import division
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
Read in the data...
In [2]:
raw_data = pd.read_csv("/Users/Frankie/Documents/Dissertation/Data/Twitter/twitter_all.csv",\
names = ["symptom", "tweet"], encoding = 'iso-8859-1').fillna('Control')
raw_data[:5]
Out[2]:
Filter for Pain and Control tweets.
In [3]:
# remove duplicate tweets and keep just the tweets relating to pain
raw_data = raw_data.drop_duplicates()
pain = raw_data[(raw_data.symptom == 'Pain')]
# get a sample of control tweets
control = raw_data[(raw_data.symptom == 'Control')].sample(pain.shape[0])
raw_data = pd.concat((pain,control))
raw_data[:5]
Out[3]:
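Note that sample draws a different set of control tweets on every run; passing a random_state would make the balanced sample reproducible (a minimal tweak, not used above):
control = raw_data[(raw_data.symptom == 'Control')].sample(pain.shape[0], random_state=0)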
In [4]:
# round-trip through CSV to reset the row indices after the filtering above
raw_data.to_csv("/Users/Frankie/Documents/Dissertation/Data/Twitter/preprocessed/temp_df_nodup2.csv",index=False,encoding = 'iso-8859-1')
raw_data = pd.read_csv("/Users/Frankie/Documents/Dissertation/Data/Twitter/preprocessed/temp_df_nodup2.csv", encoding = 'iso-8859-1')
raw_data[:5]
Out[4]:
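The CSV round-trip is one way to reset the indices; the same thing can be done in memory with reset_index (a sketch, avoiding the temporary file):
raw_data = raw_data.reset_index(drop=True)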
Convert the words into features.
In [5]:
# use the scikit-learn CountVectorizer to turn the tweets into features
# parameters are set to:
# > binary - record word presence/absence as 1/0 rather than counts
# > min_df - exclude any word appearing in fewer than 10 documents (passed as a proportion of the corpus)
# > analyzer - count words, not characters
# > ngram_range - extract unigrams and bigrams
features = raw_data["tweet"].values
vec = CountVectorizer( binary = True, min_df = 10/features.shape[0], analyzer = 'word', ngram_range=(1, 2))
# transform the data
data_features = vec.fit_transform(features)
vocab = vec.get_feature_names()
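CountVectorizer also accepts min_df as an absolute document count, so the same cut-off could be written without the division:
vec = CountVectorizer(binary = True, min_df = 10, analyzer = 'word', ngram_range=(1, 2))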
In [6]:
# densify the sparse matrix into a DataFrame, with the vocabulary as column names
data_features = pd.DataFrame(data_features.toarray(), columns = vocab)
data_features[:5]
Out[6]:
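Note that toarray() materialises the full dense matrix, which is fine at this corpus size; for much larger corpora the sparse matrix returned by fit_transform can be passed to most scikit-learn estimators directly.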
Convert the symptom into the label column.
In [7]:
# label Pain tweets 1 and Control tweets 0
raw_data['label'] = np.where(raw_data['symptom']=='Control', 0, 1)
label_df = raw_data.drop(['tweet'], axis=1)
label_df[:5]
Out[7]:
Merge label and features.
In [8]:
# join the label and feature columns on the shared row index
data_final = pd.merge(label_df, data_features, left_index=True, right_index=True)
data_final[:5]
Out[8]:
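Since both frames share the same zero-based index after the reset, pd.concat along the columns gives the same result as the index merge (an equivalent alternative):
data_final = pd.concat([label_df, data_features], axis=1)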
Save the file.
In [9]:
# drop the symptom column and write out the label plus the word features
data_final.drop(['symptom'], axis=1).to_csv("/Users/Frankie/Desktop/pain.csv",index=False, header = False)
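Because header = False drops the vocabulary column names, they can be written out separately if needed later (a sketch; the output path is illustrative):
# hypothetical companion file holding the vocabulary, one term per line
pd.Series(vocab).to_csv("/Users/Frankie/Desktop/pain_vocab.csv", index=False, header=False)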